library(pheatmap)
library(data.table)
library(RColorBrewer)
library(openxlsx)
# Load crosscheck matrix
# CrossCheckOUTPUT must already be defined (it is not set in this section).
# fread() returns a data.table; it is converted to a plain data.frame here.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
data <- as.data.frame(fread(CrossCheckOUTPUT, stringsAsFactors = FALSE))

cat("Original data dimensions:", dim(data), "\n")
## Original data dimensions: 776 777
# unique() already de-duplicates the column; no factor conversion is needed.
cat("Number of unique READGROUPS:", length(unique(data$READGROUP)), "\n")
## Number of unique READGROUPS: 776

# Data Preprocessing ----

# Remove self compare and specific columns
# Rows: keep only entries whose READGROUP contains "_".
# Columns: drop every column whose name contains "_".
# NOTE(review): presumably the "_" IDs are the RNA read groups and the
# remaining columns are genotype IDs - confirm against the input file.
#
# A logical mask is used instead of `-which(...)`: when no column name
# contains "_", `-which(...)` is `integer(0)` and would select ZERO columns
# rather than keeping all of them. `drop = FALSE` keeps the result a
# data.frame even if a single column is left.
data <- data[data$READGROUP %like% "_", , drop = FALSE]
data <- data[, !(names(data) %like% "_"), drop = FALSE]

# Load crosscheck metadata
# Only the group value and its sample label are needed from this table.
crosscheck2 <- as.data.frame(fread(CrossCheckOUTPUT2))
crosscheck2 <- crosscheck2[, c("LEFT_GROUP_VALUE", "LEFT_SAMPLE")]

# Keep the first row for each LEFT_GROUP_VALUE, and only values without "_".
is_first_occurrence <- !duplicated(crosscheck2$LEFT_GROUP_VALUE)
has_underscore <- crosscheck2$LEFT_GROUP_VALUE %like% "_"
crosscheck2 <- crosscheck2[is_first_occurrence & !has_underscore, ]

# Load sample metadata
# Read the first sheet, then keep the "Latent TB" rows and the two ID columns
# used for the merge below in a single subsetting step.
SAMPLE.EQTL <- read.xlsx(SAMPLE_SHEET, sheet = 1)
SAMPLE.EQTL <- SAMPLE.EQTL[
  SAMPLE.EQTL$TB_Status %in% "Latent TB",
  c("genotype.ID_corrected", "RNA.Sequencing_ID")
]

# Merge and reorganize data
# Attach the RNA sequencing ID to each crosscheck group via the genotype
# sample label, then order the lookup table by RNA sequencing ID.
crosscheck3 <- merge(
  crosscheck2, SAMPLE.EQTL,
  by.x = "LEFT_SAMPLE", by.y = "genotype.ID_corrected"
)
crosscheck3 <- crosscheck3[order(crosscheck3$RNA.Sequencing_ID), ]

# Fail with a readable message if the matrix lacks any expected column
# (otherwise the subset below stops with "undefined columns selected").
missing_cols <- setdiff(crosscheck3$LEFT_GROUP_VALUE, names(data))
if (length(missing_cols) > 0) {
  stop(
    "Crosscheck matrix is missing column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# Keep rows whose READGROUP is a known RNA sequencing ID, and pick the
# genotype columns in the same order as the rows of crosscheck3.
data2 <- data[
  data$READGROUP %in% crosscheck3$RNA.Sequencing_ID,
  c("READGROUP", crosscheck3$LEFT_GROUP_VALUE),
  drop = FALSE
]

# The genotype columns were selected in crosscheck3 row order, so they can be
# relabelled positionally with the matching RNA sequencing IDs. This avoids
# `2:ncol(data2)` (which counts backwards when there are no genotype columns)
# and a match()-based rename that mislabels columns when a value repeats.
names(data2)[-1] <- crosscheck3$RNA.Sequencing_ID

cat("Processed data dimensions:", dim(data2), "\n")
## Processed data dimensions: 213 214

# Prepare Matrix for Visualisation ----

# Set row names and convert to numeric matrix
# READGROUP becomes the row label and is then removed from the columns.
# `drop = FALSE` keeps a data.frame (and its row names) even when only one
# value column remains.
row.names(data2) <- data2$READGROUP
data2 <- data2[, -1, drop = FALSE]

# Convert to numeric matrix
# Commas are stripped before parsing; gsub() already returns character, so no
# extra as.character() is required.
data2[] <- lapply(data2, function(x) as.numeric(gsub(",", "", x, fixed = TRUE)))

# Order columns and rows
# Both axes are sorted by label in one step; `drop = FALSE` again protects
# the single-row / single-column case.
data2 <- data2[order(rownames(data2)), order(colnames(data2)), drop = FALSE]

data_matrix <- data.matrix(data2)

# Apply thresholds for better visualization
# Values are clipped to the range [-100, 100].
data_matrix[data_matrix > 100] <- 100
data_matrix[data_matrix < -100] <- -100

cat("Final matrix dimensions:", dim(data_matrix), "\n")
## Final matrix dimensions: 213 213

# Visualize Results ----

# Heatmap Visualisation ----

# Create heatmap with improved formatting
# Clustering is switched off on both axes, so rows and columns are drawn in
# the order they already have in data_matrix.
# The palette argument is spelled `color` in full; `col` only worked through
# partial argument matching.
# NOTE(review): the title says "TB Samples" while the sample sheet is filtered
# to "Latent TB" earlier in the script - confirm the intended wording.
pheatmap(
  data_matrix,
  color = colorRampPalette(brewer.pal(4, "YlGn"))(10),
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  fontsize_row = 6,
  fontsize_col = 6,
  main = "DNA-RNA Crosscheck Matrix\n(TB Samples)",
  show_rownames = TRUE,
  show_colnames = TRUE
)

# Summary Statistics ----

# Calculate summary statistics
# Pool every cell of the matrix into one vector and describe it; missing
# values are ignored in each statistic.
matrix_values <- as.vector(data_matrix)

summary_stats <- data.frame(
  Min = min(matrix_values, na.rm = TRUE),
  Max = max(matrix_values, na.rm = TRUE),
  Mean = mean(matrix_values, na.rm = TRUE),
  Median = median(matrix_values, na.rm = TRUE),
  SD = sd(matrix_values, na.rm = TRUE)
)

print(summary_stats)
##    Min Max     Mean Median       SD
## 1 -100 100 -99.1352   -100 12.52923

# Sample Overview ----

# Report the matrix shape (rows are reported as RNA samples, columns as
# genotype samples) and show the first ten row labels.
n_rna_samples <- nrow(data_matrix)
n_genotype_samples <- ncol(data_matrix)

cat("Number of RNA samples:", n_rna_samples, "\n")
## Number of RNA samples: 213
cat("Number of genotype samples:", n_genotype_samples, "\n")
## Number of genotype samples: 213
cat("Sample names (first 10):\n")
## Sample names (first 10):
first_sample_names <- head(rownames(data_matrix), n = 10)
print(first_sample_names)
##  [1] "BH0022_S2"  "BH0046_S10" "BH0047_S3"  "BH0049_S4"  "BH0050_S5" 
##  [6] "BH0051_S6"  "BH0054_S8"  "BH0055_S9"  "BH0059_S12" "BH0069_S6"

# Session Information ----

# Report the R version, platform, locale and the attached/loaded package
# versions for this run (the captured output follows as `##` lines).
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/London
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] openxlsx_4.2.8     RColorBrewer_1.1-3 data.table_1.16.2  pheatmap_1.0.13   
## 
## loaded via a namespace (and not attached):
##  [1] zip_2.3.0        cli_3.6.3        knitr_1.45       rlang_1.1.4     
##  [5] xfun_0.41        highr_0.10       stringi_1.8.4    jsonlite_1.8.7  
##  [9] glue_1.8.0       colorspace_2.1-1 htmltools_0.5.7  sass_0.4.7      
## [13] scales_1.3.0     rmarkdown_2.25   grid_4.3.2       evaluate_0.23   
## [17] munsell_0.5.1    jquerylib_0.1.4  fastmap_1.1.1    yaml_2.3.7      
## [21] lifecycle_1.0.4  compiler_4.3.2   Rcpp_1.0.13-1    digest_0.6.33   
## [25] R6_2.5.1         bslib_0.5.1      tools_4.3.2      gtable_0.3.6    
## [29] cachem_1.0.8